Integrative analysis reveals therapeutic potential of pyrviniym pamoate in Merkel cell carcinoma
MCC signature genes (MCC1000) reversal analysis in LINCS L1000 dataset
library(cmapR)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(circlize)
library(RColorBrewer)
library(migest)Importing data
#load L1000 files===============
DIR<-c("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/l1000/")
FILES<-list.files(path = paste0(DIR, "WNT_inhibitor"), pattern = "*_info.txt", full.names = T, recursive = T)
#FILES<-list.files(path = paste0(DIR, "WNT_related_conditions"), pattern = "*_info.txt", full.names = T, recursive = T)
gene_info<-read.table(file = paste0(DIR, "GSE92742_Broad_LINCS_gene_info.txt"), sep = "\t", quote = "", header = T)
cell_info<-read.delim2(file = paste0(DIR, "GSE92742_Broad_LINCS_cell_info.txt"), sep = "\t", quote = "", header = T)
colnames<-read.table(file = paste0(DIR, "colnames.txt"))
#load gse39612 data set ====================
gse39612_renormalized_edat<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/renormalized_samples_edat_no_celllines.RDS")
#load NE marker gene list ================
neuroendocrine_markers<-c("ENO2", "NEFM", "NEFH", "NMB", "HES6", "SOX2", "ATOH1", "CHGA")Performing Pearson Correlation analysis with GSE39612 data: correlat the expression value of all genes with NE marker genes
gse39612_cor<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/gse39612_cor.RDS")
gse39612_NE_cor<-gse39612_cor[neuroendocrine_markers,]
NE_score<-apply(t(gse39612_NE_cor), 1, sum)
NE_score_sorted<-NE_score[order(NE_score, decreasing = T)]
MCC1000<-rbind(data.frame(MCC_score = head(NE_score_sorted, 500)), data.frame(MCC_score = tail(NE_score_sorted, 500)))Overlapping the MCC1000 genes with perturbed top 1000 genes in LINCS L1000 data under different perturbagens
n=500
gse39612_high_NE_score_genes<-names(NE_score_sorted[1:n])
gse39612_low_NE_score_genes<-names(rev(NE_score_sorted)[1:n])
annotate_genes<-list()
for (i in 1:length(FILES)){
RDS_FILES<-list.files(path = paste0(DIR, "WNT_inhibitor"), pattern = "*.RDS", full.names = T, recursive = T)
level4_all<-readRDS(RDS_FILES[i])
drug_lable<-strsplit(strsplit(RDS_FILES[i], "/")[[1]][10], "_")[[1]][1]
gene_info$pr_gene_id <- as.character(gene_info$pr_gene_id)
level4_matrix<-mat(level4_all)
level4_zscore<-as.data.frame(level4_matrix)
level4_zscore$pr_gene_id<-rownames(level4_zscore)
level4_zscore<-left_join(level4_zscore, gene_info, by = "pr_gene_id")
rownames(level4_zscore)<-level4_zscore$pr_gene_symbol
level4_zscore_analysis<-level4_zscore[, 1:(ncol(level4_zscore)-5)]
level4_zscore_analysis$mean<-apply(level4_zscore_analysis, 1, mean)
level4_zscore_analysis$gene<-rownames(level4_zscore_analysis)
#Organize l1000 level4 sample info data ============================
samples_info<-read.table(file = FILES[i] ,sep = "\t", header = F)
colnames(samples_info)<-colnames
group_all=samples_info[,c("sample_id" ,"pert_itime", "pert_idose", "cell_iname" , "project_code")]
rownames(group_all)<-samples_info$sample_id
group_all<-group_all[colnames(level4_zscore),]
group_all<-na.omit(group_all)
cell_info_df<-cell_info[,c("base_cell_id", "primary_site", "subtype", "original_growth_pattern")]
colnames(cell_info_df)<-c("cell_iname","primary_site", "subtype", "original_growth_pattern")
group_all_new<-left_join(group_all, cell_info_df, by = "cell_iname", keep = F)
group_all<-group_all_new[!duplicated(group_all_new$sample_id),]
group_all$dosage<-unlist(lapply(group_all$pert_idose, function(x) strsplit(x, " ")[[1]][1]))
group_all<-group_all[group_all$dosage <= 10,]
L1000_zscored_fc<-level4_zscore_analysis
L1000_sample_info<-group_all
all_fc_edat<-L1000_zscored_fc #total 12328 genes
df<-all_fc_edat
L1000_top_downregulated_genes_all<-all_fc_edat[order(all_fc_edat$mean, decreasing = F),][1:n,]
L1000_top_upregulated_genes_all<-all_fc_edat[order(all_fc_edat$mean, decreasing = T),][1:n,]
L1000_gene_set<-c(rownames(L1000_top_downregulated_genes_all), rownames(L1000_top_upregulated_genes_all))
L1000_null_gene_set<-rownames(df)[!rownames(df) %in% L1000_gene_set]
MCC1000_gene_set<-c(gse39612_high_NE_score_genes, gse39612_low_NE_score_genes)
MCC1000_null_gene_set<-names(NE_score)[!names(NE_score) %in% MCC1000_gene_set]
annotate_down<-intersect(gse39612_high_NE_score_genes, rownames(L1000_top_downregulated_genes_all))
annotate_up<-intersect(gse39612_low_NE_score_genes, rownames(L1000_top_upregulated_genes_all))
annotate_gene_set<-c(annotate_down, annotate_down)
annotate_genes[[drug_lable]]<-annotate_gene_set
print(annotate_gene_set)
annotate_down_df<-all_fc_edat[annotate_down,]
annotate_up_df<-all_fc_edat[annotate_up,]
# Perform fisher's exact =========
dat<- data.frame(
"MCC1000_yes" = c(length(intersect(MCC1000_gene_set, L1000_gene_set)), length(intersect(MCC1000_gene_set, L1000_null_gene_set))),
"MCC1000_no" = c(length(intersect(MCC1000_null_gene_set, L1000_gene_set)), length(intersect(MCC1000_null_gene_set, L1000_null_gene_set))),
row.names = c("l1000_yes", "l1000_no"),
stringsAsFactors = FALSE)
print(drug_lable)
test <- fisher.test(dat)
print(test)
#plot for each drug ==========
df<-na.omit(df)
p1 = ggplot(data = df,
aes(x=reorder(gene, mean), y=mean),
label = gene)+
geom_point(color = "grey75",
size = 5
)+
geom_point(data = annotate_down_df , # New layer containing data subset il_genes
size = 3,
shape = 21,
fill = "firebrick",
colour = "black")+
geom_text_repel(data = annotate_down_df, aes(label = gene), size = 6, fontface = "bold", colour = "black", force = 3, max.overlaps = 70) +
geom_point(data = annotate_up_df , # New layer containing data subset il_genes
size = 3,
shape = 21,
fill = "blue",
colour = "black")+
geom_text_repel(data = annotate_up_df, aes(label = gene), size = 6, fontface = "bold", colour = "black", force = 3, max.overlaps = 70) +
ylab(paste0("Mean of Z-scored fold change of all cell lines"))+
xlab("Genes")+
scale_x_discrete(expand = c(0.15,0))+
lims(y = c(-7.5, 5))+
ggtitle(paste0("L1000 top dysregulated genes overlapping with GSE39612 MCC1000 genes \n -", drug_lable))+
theme(axis.text.x = element_blank(),
plot.title = element_text(size = 30, face = "bold"),
panel.background = element_blank(),
axis.line.y = element_line(),
#plot.margin = unit(c(1,1,1,1), "cm"),
axis.title.x = element_text(size=30, face="bold"),
axis.title.y = element_text(size=30, face="bold"),
axis.text.y = element_text(size=30, face="bold"))
plot(p1)
}## [1] "PARP1" "TYMS" "CDC45" "TIMELESS" "PPM1G" "CTPS1"
## [7] "ZWINT" "LIG1" "KIF20A" "EEF1A2" "ATF5" "UCHL1"
## [13] "CCNB1" "EEF1E1" "CCNB2" "TOP2A" "RAD51C" "PCNA"
## [19] "URB2" "SUGP2" "CDC20" "TXNRD1" "CDC25B" "FAM216A"
## [25] "PARP1" "TYMS" "CDC45" "TIMELESS" "PPM1G" "CTPS1"
## [31] "ZWINT" "LIG1" "KIF20A" "EEF1A2" "ATF5" "UCHL1"
## [37] "CCNB1" "EEF1E1" "CCNB2" "TOP2A" "RAD51C" "PCNA"
## [43] "URB2" "SUGP2" "CDC20" "TXNRD1" "CDC25B" "FAM216A"
## [1] "IWR-1-ENDO"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 0.0967
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9521115 1.5992295
## sample estimates:
## odds ratio
## 1.241408
## [1] "FEN1" "MCM6" "PARP1" "MCM2" "STMN1" "FANCI" "BUB1B"
## [8] "TYMS" "RRM2" "TRIP13" "DEPDC1" "CDC25A" "LPCAT1" "BUB1"
## [15] "LBR" "AURKA" "ZWINT" "DLGAP5" "HJURP" "GINS1" "LIG1"
## [22] "H2AFX" "MCM10" "MKI67" "KIF4A" "UBE2C" "KIF20A" "GTSE1"
## [29] "DDX39A" "EEF1A2" "ATF5" "MCM4" "CCNB1" "BIRC5" "CCNB2"
## [36] "KIF18B" "TOP2A" "PRC1" "CDCA8" "NARF" "OIP5" "CCNA2"
## [43] "PTTG1" "HMMR" "NCAPH" "LMNB2" "TACC3" "PCNA" "PSRC1"
## [50] "ZNF324" "CHAF1A" "SNRNP25" "ASPM" "MCM5" "CDC20" "TPX2"
## [57] "TTK" "FEN1" "MCM6" "PARP1" "MCM2" "STMN1" "FANCI"
## [64] "BUB1B" "TYMS" "RRM2" "TRIP13" "DEPDC1" "CDC25A" "LPCAT1"
## [71] "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5" "HJURP" "GINS1"
## [78] "LIG1" "H2AFX" "MCM10" "MKI67" "KIF4A" "UBE2C" "KIF20A"
## [85] "GTSE1" "DDX39A" "EEF1A2" "ATF5" "MCM4" "CCNB1" "BIRC5"
## [92] "CCNB2" "KIF18B" "TOP2A" "PRC1" "CDCA8" "NARF" "OIP5"
## [99] "CCNA2" "PTTG1" "HMMR" "NCAPH" "LMNB2" "TACC3" "PCNA"
## [106] "PSRC1" "ZNF324" "CHAF1A" "SNRNP25" "ASPM" "MCM5" "CDC20"
## [113] "TPX2" "TTK"
## [1] "PRI-724"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 4.077e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.299362 2.082383
## sample estimates:
## odds ratio
## 1.652257
## [1] "COL9A2" "CDKN2A" "TYMS" "MTHFD2" "CDC45" "DNAJC6"
## [7] "BRINP1" "EEF1A2" "CCNB1" "DNMT3A" "BCL2L11" "CCNB2"
## [13] "TTLL7" "PHC2" "PCNA" "EIF4EBP1" "CDC20" "PARD6A"
## [19] "HDAC9" "TOX" "COL9A2" "CDKN2A" "TYMS" "MTHFD2"
## [25] "CDC45" "DNAJC6" "BRINP1" "EEF1A2" "CCNB1" "DNMT3A"
## [31] "BCL2L11" "CCNB2" "TTLL7" "PHC2" "PCNA" "EIF4EBP1"
## [37] "CDC20" "PARD6A" "HDAC9" "TOX"
## [1] "XAV-939"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 0.0001013
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.272368 2.043034
## sample estimates:
## odds ratio
## 1.619585
## [1] "SOX2" "COL9A2" "PAFAH1B3" "STMN1" "CDC45" "CACNA2D2"
## [7] "LBR" "AURKA" "LIG1" "KIF20A" "KIF14" "BRINP1"
## [13] "EEF1A2" "SPAG5" "CCNB1" "UNC13A" "CCNB2" "TOP2A"
## [19] "CHGA" "SMC4" "NARF" "PCNA" "PSRC1" "KIF2C"
## [25] "EZH2" "CDC20" "NT5DC3" "YEATS2" "SOX2" "COL9A2"
## [31] "PAFAH1B3" "STMN1" "CDC45" "CACNA2D2" "LBR" "AURKA"
## [37] "LIG1" "KIF20A" "KIF14" "BRINP1" "EEF1A2" "SPAG5"
## [43] "CCNB1" "UNC13A" "CCNB2" "TOP2A" "CHGA" "SMC4"
## [49] "NARF" "PCNA" "PSRC1" "KIF2C" "EZH2" "CDC20"
## [55] "NT5DC3" "YEATS2"
## [1] "indirubin"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 0.002835
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.132772 1.852121
## sample estimates:
## odds ratio
## 1.455863
## [1] "PARP1" "UBE2S" "PAFAH1B3" "CDC45" "KIF20A" "DTYMK"
## [7] "DDX39A" "KIF14" "EEF1A2" "ATF5" "CCNB1" "CCNB2"
## [13] "PCNA" "EIF4EBP1" "CDC20" "TXNRD1" "CDC25B" "PARP1"
## [19] "UBE2S" "PAFAH1B3" "CDC45" "KIF20A" "DTYMK" "DDX39A"
## [25] "KIF14" "EEF1A2" "ATF5" "CCNB1" "CCNB2" "PCNA"
## [31] "EIF4EBP1" "CDC20" "TXNRD1" "CDC25B"
## [1] "mesalazine"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 0.6257
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8019211 1.3939069
## sample estimates:
## odds ratio
## 1.064963
## [1] "FEN1" "MCM6" "PARP1" "UBE2S" "PAFAH1B3" "MCM2"
## [7] "STMN1" "FANCI" "NETO2" "RNASEH2A" "TYMS" "GINS2"
## [13] "CDC45" "TIMELESS" "NCAPG" "RRM2" "ESPL1" "TRIP13"
## [19] "NCAPG2" "DEPDC1" "NEK2" "CDC25A" "CTPS1" "LPCAT1"
## [25] "FOXM1" "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5"
## [31] "CDC7" "HJURP" "GINS1" "LIG1" "H2AFX" "MCM10"
## [37] "MKI67" "KIF4A" "UBE2C" "KIF20A" "DTYMK" "GTSE1"
## [43] "TRMT5" "FBXO5" "KIF14" "CENPE" "CKS2" "CENPM"
## [49] "SPAG5" "CCNB1" "BIRC5" "NDC80" "CCNB2" "MELK"
## [55] "KIF18B" "TOP2A" "MYBL2" "ASF1B" "PRC1" "CDCA8"
## [61] "SMC4" "XPO6" "OIP5" "CCNA2" "NEIL3" "PTTG1"
## [67] "POLE2" "HMMR" "NCAPH" "TACC3" "PCNA" "PSRC1"
## [73] "KIF18A" "KIF2C" "SNRNP25" "EZH2" "ASPM" "MCM5"
## [79] "CDC20" "SPC25" "TPX2" "CDC25B" "CEP55" "TTK"
## [85] "FEN1" "MCM6" "PARP1" "UBE2S" "PAFAH1B3" "MCM2"
## [91] "STMN1" "FANCI" "NETO2" "RNASEH2A" "TYMS" "GINS2"
## [97] "CDC45" "TIMELESS" "NCAPG" "RRM2" "ESPL1" "TRIP13"
## [103] "NCAPG2" "DEPDC1" "NEK2" "CDC25A" "CTPS1" "LPCAT1"
## [109] "FOXM1" "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5"
## [115] "CDC7" "HJURP" "GINS1" "LIG1" "H2AFX" "MCM10"
## [121] "MKI67" "KIF4A" "UBE2C" "KIF20A" "DTYMK" "GTSE1"
## [127] "TRMT5" "FBXO5" "KIF14" "CENPE" "CKS2" "CENPM"
## [133] "SPAG5" "CCNB1" "BIRC5" "NDC80" "CCNB2" "MELK"
## [139] "KIF18B" "TOP2A" "MYBL2" "ASF1B" "PRC1" "CDCA8"
## [145] "SMC4" "XPO6" "OIP5" "CCNA2" "NEIL3" "PTTG1"
## [151] "POLE2" "HMMR" "NCAPH" "TACC3" "PCNA" "PSRC1"
## [157] "KIF18A" "KIF2C" "SNRNP25" "EZH2" "ASPM" "MCM5"
## [163] "CDC20" "SPC25" "TPX2" "CDC25B" "CEP55" "TTK"
## [1] "pyrvinium"
##
## Fisher's Exact Test for Count Data
##
## data: dat
## p-value = 8.993e-09
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.571251 2.459727
## sample estimates:
## odds ratio
## 1.973272
Performing pairwise intersection on overlapped genes in different Wnt-perturbing conditions
Organizing data
fset<-unique(unlist(annotate_genes))
#pairwise intersection
conditions<-unlist(lapply(RDS_FILES, function(x) strsplit(strsplit(x, "/")[[1]][10], "_")[[1]][1]))
combs<-combn(length(conditions), 2)
summary<-data.frame()
for (i in 1:ncol(combs)){
pair_id<-combs[,i]
set1<-annotate_genes[[pair_id[1]]]
set2<-annotate_genes[[pair_id[2]]]
set1id<-conditions[pair_id[1]]
set2id<-conditions[pair_id[2]]
int<-intersect(set1, set2)
df<-data.frame(set1 = set1id, set2 = set2id, intersection = paste(int, collapse = "/"), gene_count = length(int))
#df<-data.frame(set1 = set1id, set2 = set2id, intersection = int, gene_count = length(int))
summary<-rbind(summary, df)
}
summary$set1<-as.factor(summary$set1)
summary$set2<-as.factor(summary$set2)Generating circos plot from the pariwise intersection
### Make data
colors<-brewer.pal(n = length(conditions), "Set2")
df1_new<-data.frame(order = 1:length(conditions),
conditions = conditions,
rcol = colors,
lcol = paste0(colors, "65"),
xmin = rep(0, length(conditions)),
xmax = unlist(lapply(annotate_genes, function(x) length(x))))
### Plot sectors (outer part)
par(mar=rep(0,4))
circos.clear()
### Basic circos graphic parameters
circos.par(cell.padding=c(0,0,0,0), track.margin=c(0,0.15), start.degree = 90, gap.degree = 4)
### Sector details
circos.initialize(factors = df1_new$conditions, xlim = cbind(df1_new$xmin, df1_new$xmax))
### Plot sectors
circos.trackPlotRegion(ylim = c(0, 1), factors = df1_new$conditions, track.height=0.1,
#panel.fun for each sector
panel.fun = function(x, y) {
#select details of current sector
name = get.cell.meta.data("sector.index")
i = get.cell.meta.data("sector.numeric.index")
xlim = get.cell.meta.data("xlim")
ylim = get.cell.meta.data("ylim")
#text direction (dd) and adjusmtents (aa)
theta = circlize(mean(xlim), 1.3)[1, 1] %% 360
dd <- ifelse(theta < 90 || theta > 270, "clockwise", "reverse.clockwise")
aa = c(1, 0.5)
if(theta < 90 || theta > 270) aa = c(0, 0.5)
#plot conditions labels
circos.text(x=mean(xlim), y=1.7, labels=name, facing = dd, cex=1.7, font=2, adj = aa)
#plot main sector
circos.rect(xleft=xlim[1], ybottom=ylim[1], xright=xlim[2], ytop=ylim[2],
col = df1_new$rcol[i], border=df1_new$rcol[i])
#plot axis
circos.axis(labels.cex=1.5, direction = "outside", major.at=seq(from=0,to=floor(df1_new$xmax)[i], by = floor(df1_new$xmax)[i])
,minor.ticks=0
)
})
### Plot links (inner part)
### Add sum values to df1, marking the x-position of the first links
### out (sum1) and in (sum2). Updated for further links in loop below.
df1_new$sum1 <- df1_new$xmax
df1_new$sum2 <- numeric(nrow(df1_new))
df1_new$genes <- "/NA"
#df1_new$conditions <- factor(df1_new$conditions, levels = df1_new$conditions)
### Create a data.frame of the flow matrix sorted by flow size, to allow largest flow plotted first
df2_new<-summary[, c("set1", "set2", "gene_count", "intersection")]
df2_new<-arrange(df2_new, desc(gene_count))
### Plot links
for(k in 1:nrow(df2_new)){
#i,j reference of flow matrix
i<-match(df2_new$set1[k],df1_new$conditions)
j<-match(df2_new$set2[k],df1_new$conditions)
genes<-strsplit(subset(df2_new, set1 == df1_new$conditions[i] & set2 == df1_new$conditions[j])[,"intersection"], "/")[[1]]
#plot link
circos.link(sector.index1=df1_new$conditions[i],
point1=c(df1_new$sum2[i]-length(intersect(genes, strsplit(df1_new$genes[i], "/")[[1]])), df1_new$sum2[i] + df2_new$gene_count[k]),
sector.index2=df1_new$conditions[j],
point2=c(df1_new$sum2[j]-length(intersect(genes, strsplit(df1_new$genes[j], "/")[[1]])), df1_new$sum2[j] + df2_new$gene_count[k]),
col = df1_new$lcol[i], rou1 = 0.72, rou2 = 0.72)
df1_new$sum2[i]<-df1_new$sum2[i] + df2_new$gene_count[k] - length(intersect(genes, strsplit(df1_new$genes[i], "/")[[1]]))
df1_new$sum2[j]<-df1_new$sum2[j] + df2_new$gene_count[k] - length(intersect(genes, strsplit(df1_new$genes[j], "/")[[1]]))
df1_new$genes[i]<-paste(unique(c(strsplit(df1_new$genes[i], "/")[[1]], genes)), collapse = "/")
df1_new$genes[j]<-paste(unique(c(strsplit(df1_new$genes[j], "/")[[1]], genes)), collapse = "/")
}